library(tidyverse); library(cowplot)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.3     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(phyloseq); library(decontam)
library(compositions); library(patchwork); library(ggupset)
## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
## 
## Attaching package: 'compositions'
## The following objects are masked from 'package:stats':
## 
##     anova, cor, cov, dist, var
## The following objects are masked from 'package:base':
## 
##     %*%, norm, scale, scale.default
## 
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
## 
##     align_plots

1 Import exisiting data

Datasets included: * Gorda Ridge 2019 cruise * Axial seamount timeseries * Mid-Cayman Rise 2020 cruisee

1.1 Merged data from QIIME2 analysis

merged_tax <- read_delim("data-input/taxonomy.tsv", delim = "\t")
merged_asv <- read_delim("data-input/microeuk-merged-asv-table.tsv", delim = "\t", skip = 1)
# head(merged_tax)
metadata <- read.delim("data-input/samplelist-metadata.txt")
# head(metadata)
asv_wtax <- merged_asv %>%
  select(FeatureID = '#OTU ID', everything()) %>%
  pivot_longer(cols = !FeatureID,
               names_to = "SAMPLE", values_to = "value") %>%
  left_join(merged_tax, by = c("FeatureID" = "Feature ID")) %>%
  left_join(metadata) %>%
  filter(!grepl("Siders_", SAMPLE)) %>% 
  filter(SAMPLETYPE != "Incubation") %>% 
  filter(SAMPLETYPE != "Microcolonizer") %>%
  mutate(DATASET = case_when(
    grepl("_GR_", SAMPLE) ~ "GR",
    grepl("Gorda", SAMPLE) ~ "GR",
    grepl("_MCR_", SAMPLE) ~ "MCR",
    grepl("Axial", SAMPLE) ~ "Axial",
  TRUE ~ "Control or blank")) %>%
    separate(Taxon, c("Domain", "Supergroup",
                  "Phylum", "Class", "Order",
                  "Family", "Genus", "Species"), sep = ";", remove = FALSE) %>% 
  unite(SAMPLENAME, SITE, SAMPLETYPE, YEAR, VENT, SAMPLEID, sep = " ", remove = FALSE)
## Joining, by = "SAMPLE"
## Warning: Expected 8 pieces. Additional pieces discarded in 154116 rows [163,
## 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
## 180, 181, 182, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 140238 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# View(asv_wtax)
# head(asv_wtax) ## Complete ASV table with full taxonomy names and annotated sample information

1.1.1 Total number of sequences

Barplots to show total number of sequences and total number of ASVs

# head(asv_wtax)
plot_grid(
  # Total number of ASVs
  asv_wtax %>% 
  filter(value > 0) %>% 
  ggplot(aes(x = SAMPLENAME)) +
  geom_bar(stat = "count", width = 0.9) +
  labs(y = "Total ASVs per sample", x = "") +
  coord_flip() +
    scale_y_continuous(position = "right") +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 1),
        axis.text.y = element_text(angle = 0, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")),
  asv_wtax %>% 
  group_by(SAMPLENAME, SITE, Domain, DATASET) %>% 
  summarise(SUM_SEQ_DOMAIN = sum(value)) %>% 
  ggplot(aes(x = SAMPLENAME, y = SUM_SEQ_DOMAIN, fill = Domain)) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  labs(y = "Total sequences per sample", x = "") +
  coord_flip() +
  scale_y_continuous(position = "right") +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 1),
        axis.text.y = element_text(angle = 0, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "right"),
  ncol = 2, align = c("hv"), axis = c("lr"))
## `summarise()` has grouped output by 'SAMPLENAME', 'SITE', 'Domain'. You can override using the `.groups` argument.

1.1.2 Table reporting total ASVs and sequences

# View(asv_wtax %>% filter(value > 0) %>%
#        group_by(SAMPLENAME, DATASET, SITE) %>%
#        summarise(SEQ_SUM = sum(value),
#                  ASV_COUNT = n()))

1.2 Decontaminate sequence library

Import sample description text file, import as phyloseq library, and remove potential contaminate ASVs and sequences. Catalog total number of ASVs and sequences removed from analysis.

# library(decontam); library(phyloseq)

1.2.1 Import as phyloseq objects

tax_matrix <- merged_tax %>% 
  select(FeatureID = `Feature ID`, Taxon) %>% 
  separate(Taxon, c("Domain", "Supergroup", 
                  "Phylum", "Class", "Order",
                  "Family", "Genus", "Species"), sep = ";", remove = FALSE) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix
## Warning: Expected 8 pieces. Additional pieces discarded in 2854 rows [2, 7, 8,
## 11, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29, 32, 33, 34, 35, 36, 37, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 2597 rows [1, 3,
## 4, 5, 6, 9, 10, 12, 13, 21, 22, 23, 24, 25, 26, 30, 31, 39, 40, 42, ...].
asv_matrix <- merged_asv %>% 
  select(FeatureID = '#OTU ID', everything()) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix

# Align row names for each matrix
rownames(tax_matrix) <- row.names(asv_matrix)

# Set rownames of metadata table to SAMPLE information
row.names(metadata) <- metadata$SAMPLE
# Import asv and tax matrices
ASV = otu_table(asv_matrix, taxa_are_rows = TRUE)
TAX = tax_table(tax_matrix)
phylo_obj <- phyloseq(ASV, TAX)

# Import metadata as sample data in phyloseq
samplenames <- sample_data(metadata)

# join as phyloseq object
physeq_wnames = merge_phyloseq(phylo_obj, samplenames)
# colnames(ASV)

## Check
# physeq_wnames

1.2.2 Identify contaminant ASVs

# When "Control" appears in "Sample_or_Control column, this is a negative control"
sample_data(physeq_wnames)$is.neg <- sample_data(physeq_wnames)$Sample_or_Control == "Control"
# ID contaminants using Prevalence information
contam_prev <- isContaminant(physeq_wnames, 
                               method="prevalence", 
                               neg="is.neg", 
                               threshold = 0.5, normalize = TRUE) 
## Warning in .is_contaminant(seqtab, conc = conc, neg = neg, method = method, :
## Removed 2 samples with zero total counts (or frequency).

## Warning in .is_contaminant(seqtab, conc = conc, neg = neg, method = method, :
## Removed 2 samples with zero total counts (or frequency).
# Report number of ASVs IDed as contamintants
table(contam_prev$contaminant)
## 
## FALSE  TRUE 
##  5412    39

0.5 - this threshold will ID contaminants in all samples that are more prevalent in negative controls than in positive samples.

1.2.3 Remove problematic ASVs

# Subset contaminant ASVs
contams <- filter(contam_prev, contaminant == "TRUE")
list_of_contam_asvs <- as.character(row.names(contams))
# length(list_of_contam_asvs)

taxa_contam <- as.data.frame(tax_matrix) %>% 
  rownames_to_column(var = "FeatureID") %>% 
  filter(FeatureID %in% list_of_contam_asvs)
# head(taxa_contam)
# View(asv_wtax)
asv_wtax_decon <- asv_wtax %>% 
  filter(!(FeatureID %in% list_of_contam_asvs)) %>% 
  filter(!(Sample_or_Control == "Control"))

tmp_orig <- (asv_wtax %>% filter(!(Sample_or_Control == "Control")))

# Stats on lost
x <- length(unique(tmp_orig$FeatureID)); x
## [1] 5451
y <- length(unique(asv_wtax_decon$FeatureID)); y
## [1] 5412
100*((y-x)/x) # 42 total ASVs lost
## [1] -0.7154651
a <- sum(tmp_orig$value);a #4.55 million
## [1] 2101838
b <- sum(asv_wtax_decon$value);b #4.14 million 
## [1] 2044985
100*((b-a)/a)
## [1] -2.704918
# Lost 9% of sequences from whole dataset.

## Subsample to clean ASVs
asv_wtax_wstats <- asv_wtax %>% 
  mutate(DECONTAM = case_when(
    FeatureID %in% list_of_contam_asvs ~ "FAIL",
    TRUE ~ "PASS"
  ))
plot_grid(asv_wtax_wstats %>% 
  filter(value > 0) %>% 
  ggplot(aes(x = SAMPLE, fill = DECONTAM)) +
  geom_bar(stat = "count", width = 0.9, color = "black") +
  labs(y = "Total ASVs") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  asv_wtax_wstats %>% 
  group_by(SAMPLE, SITE, DECONTAM, DATASET) %>% 
  summarise(SUM_SEQ_DOMAIN = sum(value)) %>% 
  ggplot(aes(x = SAMPLE, y = SUM_SEQ_DOMAIN, fill = DECONTAM)) +
  geom_bar(stat = "identity", color = "black", width = 0.9) +
  labs(y = "Total Sequences") +
  coord_flip() +
  theme_linedraw() +
  facet_grid(DATASET ~ ., scale = "free", space = "free") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "bottom"),
  ncol = 2)
## `summarise()` has grouped output by 'SAMPLE', 'SITE', 'DECONTAM'. You can override using the `.groups` argument.

2 Survey of sequences from in situ samples

colnames(asv_wtax_wstats)
##  [1] "FeatureID"         "SAMPLE"            "value"            
##  [4] "Taxon"             "Domain"            "Supergroup"       
##  [7] "Phylum"            "Class"             "Order"            
## [10] "Family"            "Genus"             "Species"          
## [13] "Consensus"         "SAMPLENAME"        "VENT"             
## [16] "COORDINATES"       "SITE"              "Sample_or_Control"
## [19] "SAMPLEID"          "DEPTH"             "SAMPLETYPE"       
## [22] "YEAR"              "TEMP"              "pH"               
## [25] "PercSeawater"      "Mg"                "NO3"              
## [28] "H2"                "H2S"               "CH4"              
## [31] "ProkConc"          "Sample_actual"     "Type"             
## [34] "DATASET"           "DECONTAM"
sites <- c("Piccard", "VonDamm", "Axial", "GordaRidge")
asv_insitu <- asv_wtax_wstats %>% filter(Sample_or_Control != "Control") %>% 
       filter(SITE %in% sites) %>% 
       filter(!grepl("_expTf_", SAMPLE)) %>% 
       filter(value > 0) %>% 
       filter(DECONTAM == "PASS")

# Get quick stats on totals
sum(asv_insitu$value) # 2million sequences
## [1] 2044985
length(unique(asv_insitu$FeatureID)) #2923 ASVs
## [1] 2925

Additional sample QC, check replicates, and determine if replicates should be averaged.

plot_grid(asv_insitu %>% 
  group_by(SAMPLENAME, VENT, DATASET, Domain) %>% 
  summarise(seqsum_var = sum(value),
            asvcount_var = n()) %>% 
  pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>% 
  ggplot(aes(x = SAMPLENAME, y = value, fill = Domain)) +
    geom_bar(color = "black", stat = "identity", position = "fill") +
    facet_grid(VARIABLE ~ DATASET, space = "free", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme_linedraw() +
  scale_fill_brewer(palette = "Paired") +
  theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom"),
  asv_insitu %>% 
  group_by(SAMPLENAME, VENT, DATASET, Domain) %>% 
  summarise(seqsum_var = sum(value),
            asvcount_var = n()) %>% 
  pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>% 
  ggplot(aes(x = SAMPLENAME, y = value, fill = Domain)) +
    geom_bar(color = "black", stat = "identity", position = "stack") +
    facet_grid(VARIABLE ~ DATASET, space = "free_x", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme_linedraw() +
  scale_fill_brewer(palette = "Paired") +
  theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
        axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
        legend.position = "bottom"),
ncol = 2)
## `summarise()` has grouped output by 'SAMPLENAME', 'VENT', 'DATASET'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME', 'VENT', 'DATASET'. You can override using the `.groups` argument.

2.1 Base bar plot - taxonomy

asv_insitu %>% 
  filter(Domain == "Eukaryota") %>%
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Supergroup) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  ggplot(aes(x = VENT, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "stack", color = "black", width = 0.9) +
    facet_grid(. ~ SITE + SAMPLETYPE, scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
    "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
    "#807dba", "#54278f", "#bdbdbd", "black"))
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT'. You can override using the `.groups` argument.

Repeat taxonomy barplot, but with relative abundance

asv_insitu %>% 
  filter(Domain == "Eukaryota") %>%
  # unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  group_by(SITE, SAMPLETYPE, VENT, Supergroup) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  ggplot(aes(x = VENT, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
    facet_grid(. ~ SITE + SAMPLETYPE, scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black")) +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
    "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
    "#807dba", "#54278f", "#bdbdbd", "black"))
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT'. You can override using the `.groups` argument.

3 Removal of samples - QC

# head(asv_insitu)
# View(asv_insitu)
# length(unique(asv_insitu$FeatureID))
# View(asv_insitu %>% filter(value > 0) %>%
#        group_by(SAMPLENAME, DATASET, SITE) %>%
#        summarise(SEQ_SUM = sum(value),
#                  ASV_COUNT = n_distinct(FeatureID)))

4 Diversity estimators

DivNet package - diversity estimation hypothesis testing from Amy Willis’s group. This will also characterize the uncertainty of the richness estimate. Richness estimation is flawed because of sample depth and processing methods.

library(phyloseq); library(breakaway); library(DivNet)

# Select eukaryotes only and create wide format dataframe
insitu_wide <- asv_insitu %>% 
  filter(Domain == "Eukaryota") %>% 
  filter(!grepl("_Plume001_", SAMPLE)) %>% #removing "near vent background", not relevant in other data sets
  select(FeatureID, Taxon, SAMPLE, value) %>%
  pivot_wider(names_from = SAMPLE, values_from = value, values_fill = 0)

# head(insitu_wide)
insitu_samples <- as.character(colnames(insitu_wide %>% select(-Taxon, -FeatureID)))
# insitu_samples
insitu_tax_matrix <- insitu_wide %>%
  select(FeatureID, Taxon) %>% 
  separate(Taxon, c("Domain", "Supergroup", 
                  "Phylum", "Class", "Order",
                  "Family", "Genus", "Species"), sep = ";") %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix
## Warning: Expected 8 pieces. Additional pieces discarded in 1600 rows [2, 3, 4,
## 6, 9, 10, 11, 12, 13, 14, 17, 19, 20, 21, 22, 23, 26, 30, 32, 36, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 937 rows [1, 5,
## 7, 8, 15, 16, 18, 24, 25, 27, 28, 29, 31, 33, 34, 35, 37, 45, 47, 48, ...].
insitu_asv_matrix <- insitu_wide %>% 
  select(-Taxon) %>% 
  column_to_rownames(var = "FeatureID") %>% 
  as.matrix

# Align row names for each matrix
rownames(insitu_tax_matrix) <- row.names(insitu_asv_matrix)

## Extract relevant metadata information
# head(metadata)
metadata_insitu <- metadata %>%
  filter(SAMPLE %in% insitu_samples) %>% # from reformatting df above
  select(SAMPLE, VENT, SITE, SAMPLETYPE, YEAR) %>%
  unite(SAMPLELABEL, VENT, SITE, SAMPLETYPE, YEAR, sep = "_", remove = FALSE) %>% 
  unite(TYPE_SITE, SITE, SAMPLETYPE, sep = "_", remove = FALSE)
# View(metadata_insitu)
# Import asv and tax matrices
ASV = otu_table(insitu_asv_matrix, taxa_are_rows = TRUE)
TAX = tax_table(insitu_tax_matrix)
phylo_obj <- phyloseq(ASV, TAX)


# Import metadata as sample data in phyloseq
samplenames <- sample_data(metadata_insitu)
# View(samplenames)

# join as phyloseq object
physeq_insitu = merge_phyloseq(phylo_obj, samplenames)

## Check
physeq_insitu
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2537 taxa and 44 samples ]
## sample_data() Sample Data:       [ 44 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 2537 taxa by 8 taxonomic ranks ]
# head(tax_matrix[1:2,])

Note options to apply divnet to

# Glom tax levels at the Order level, then perform divnet analysis
# order_divnet <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), base = 3)
# order_divnet <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), X = "SAMPLELABEL", base = 3)
# order_divnet_TYPE <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), X = "SAMPLETYPE", base = 3)
# order_divnet_TYPE_SITE <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), X = "TYPE_SITE", base = 3)

# fam_divnet <- divnet(tax_glom(physeq_insitu, taxrank = "Family"), base = 3)
# head(metadata_insitu)
  
# fxn_extract_divet <- function(df){
#   df$shannon %>% summary %>% 
#   pivot_longer(cols = starts_with("estimate"), names_to = "ESTIMATE-shannon", values_to = "Shannon") %>% 
#   pivot_longer(cols = starts_with("error"), names_to = "ERROR-shannon", values_to = "Shannon-error") %>%
#   pivot_longer(cols = starts_with("lower"), names_to = "LOWER-shannon", values_to = "Shannon-lower") %>%
#   pivot_longer(cols = starts_with("upper"), names_to = "UPPER-shannon", values_to = "Shannon-upper") %>%
#   left_join(df$simpson %>% summary %>% 
#       pivot_longer(cols = starts_with("estimate"), names_to = "ESTIMATE-simpson", values_to = "Simpson") %>% 
#       pivot_longer(cols = starts_with("error"), names_to = "ERROR-simpson", values_to = "Simpson-error") %>%
#       pivot_longer(cols = starts_with("lower"), names_to = "LOWER-simpson", values_to = "Simpson-lower") %>%
#       pivot_longer(cols = starts_with("upper"), names_to = "UPPER-simpson", values_to = "Simpson-upper"), 
#     by = c("sample_names" = "sample_names")) %>% 
#   left_join(metadata_insitu %>% rownames_to_column(var = "sample_names")) %>% 
#   select(-sample_names, -ends_with("-simpson"), -ends_with("-shannon"), -starts_with("model."), -starts_with("name.")) %>% 
#   distinct()
# }
# 
# order_alpha_18s <- fxn_extract_divet(order_divnet)
# order_alpha_TYPE <- fxn_extract_divet(order_divnet_TYPE)
# order_alpha_TYPE_SITE <- fxn_extract_divet(order_divnet_TYPE_SITE)

4.1 Plot species richness for different sample types

4.1.1 All samples

# head(order_alpha_18s)

# plot_grid(order_alpha_18s %>% 
#   # ggplot(aes(x = VENT, y = Shannon)) +
#   ggplot(aes(x = SAMPLETYPE, y = Shannon, group = SAMPLETYPE)) +
#   # geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
#     # geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
#     geom_violin(aes(fill = SAMPLETYPE), color = "#525252", alpha = 0.5, width = 0.5, draw_quantiles = c(0.25, 0.5, 0.75)) +   
#     geom_jitter(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#     scale_fill_brewer(palette = "Set2") +
#   theme_linedraw() +
#   theme(axis.text.y = element_text(size = 14),
#         axis.text.x = element_blank(),
#         strip.background = element_blank(),
#         strip.text = element_text(color = "black"),
#         legend.position = "none",
#         axis.ticks.x = element_blank()) +
#   labs(x = "", y = "Shannon"),
#   order_alpha_18s %>% 
#   # ggplot(aes(x = VENT, y = Simpson)) +
#     ggplot(aes(x = SAMPLETYPE, y = Shannon, group = SAMPLETYPE)) +
#   # geom_errorbar(aes(ymin = `Simpson-lower`, ymax = `Simpson-upper`), color = "#525252", width = 0.2) +
#     # geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
#     geom_violin(aes(fill = SAMPLETYPE), color = "#525252", alpha = 0.5, width = 0.5, draw_quantiles = c(0.25, 0.5, 0.75)) +   
#     geom_jitter(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#     scale_fill_brewer(palette = "Set2") +
#   theme_linedraw() +
#   theme(axis.text.x = element_text(vjust = 1, hjust = 0.5, size = 14),
#         axis.text = element_text(size = 14),
#         strip.background = element_blank(),
#         strip.text = element_blank(),
#         legend.title = element_blank(),
#         legend.position = "bottom") +
#   labs(x = "", y = "Simpson"),
#   ncol = 1, axis = c("lrt"), align = c("vh"))

Save for presentation

# svg("Shannon-violin-plot.svg",)
# order_alpha_18s %>% 
#   # ggplot(aes(x = VENT, y = Shannon)) +
#   ggplot(aes(x = SAMPLETYPE, y = Shannon, group = SAMPLETYPE)) +
#   # geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
#     # geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
#     geom_violin(aes(fill = SAMPLETYPE), color = "#525252", alpha = 0.5, width = 0.5, draw_quantiles = c(0.25, 0.5, 0.75)) +   
#     geom_jitter(shape = 21, color = "#525252", size = 3, aes(fill = SAMPLETYPE)) +
#     # scale_fill_brewer(palette = "Set2") +
#   scale_fill_manual(values = c("#1c9099", "#fd8d3c", "#f768a1")) + 
#   theme_linedraw() +
#   theme(axis.text.x = element_text(vjust = 1, hjust = 0.5, size = 14),
#         axis.text = element_text(size = 14),
#         strip.background = element_blank(),
#         strip.text = element_blank(),
#         legend.title = element_blank(),
#         legend.position = "bottom") +
#   labs(x = "", y = "Shannon")
# dev.off()
# head(order_alpha_TYPE)
# plot_grid(order_alpha_TYPE %>% 
#     select(-SAMPLELABEL, -VENT, -SITE, -TYPE_SITE, -YEAR) %>% 
#     distinct() %>% 
#   ggplot(aes(x = SAMPLETYPE, y = Shannon)) +
#   geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
#     geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
#     scale_fill_brewer(palette = "Set2") +
#   theme_linedraw() +
#   theme(axis.text.x = element_blank(),
#         strip.background = element_blank(),
#         strip.text = element_text(color = "black"),
#         legend.position = "none",
#         axis.ticks = element_blank()) +
#   labs(x = "", y = "Shannon"),
#   order_alpha_TYPE %>% 
#     select(-SAMPLELABEL, -VENT, -SITE, -TYPE_SITE, -YEAR) %>% 
#     distinct() %>% 
#   ggplot(aes(x = SAMPLETYPE, y = Simpson)) +
#   geom_errorbar(aes(ymin = `Simpson-lower`, ymax = `Simpson-upper`), color = "#525252", width = 0.2) +
#     geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
#     scale_fill_brewer(palette = "Set2") +
#   theme_linedraw() +
#   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
#         strip.background = element_blank(),
#         strip.text = element_blank(),
#         legend.title = element_blank(),
#         legend.position = "bottom") +
#   labs(x = "", y = "Simpson"),
#   ncol = 1, axis = c("lr"), align = c("v"))
# head(order_alpha_TYPE_SITE)
# plot_grid(order_alpha_TYPE_SITE %>% 
#     select(-SAMPLELABEL, -YEAR, -VENT) %>% 
#     distinct() %>% 
#   ggplot(aes(x = SAMPLETYPE, y = Shannon)) +
#   geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
#     geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   facet_grid(. ~ SITE, space = "free_x", scales = "free_x") +
#     scale_fill_brewer(palette = "Set2") +
#   theme_linedraw() +
#   theme(axis.text.x = element_blank(),
#         strip.background = element_blank(),
#         strip.text = element_text(color = "black"),
#         legend.position = "none",
#         axis.ticks = element_blank()) +
#   labs(x = "", y = "Shannon"),
#   order_alpha_TYPE_SITE %>% 
#     select(-SAMPLELABEL, -YEAR, -VENT) %>% 
#     distinct() %>% 
#   ggplot(aes(x = SAMPLETYPE, y = Simpson)) +
#   geom_errorbar(aes(ymin = `Simpson-lower`, ymax = `Simpson-upper`), color = "#525252", width = 0.2) +
#     geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
#   facet_grid(. ~ SITE, space = "free_x", scales = "free_x") +
#     scale_fill_brewer(palette = "Set2") +
#   theme_linedraw() +
#   theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
#         strip.background = element_blank(),
#         strip.text = element_blank(),
#         legend.title = element_blank(),
#         legend.position = "bottom") +
#   labs(x = "", y = "Simpson"),
#   ncol = 1, axis = c("lr"), align = c("v"))

5 Assess ASV presence/absence

Set up analysis to classify each ASV

head(asv_insitu)
## # A tibble: 6 × 35
##   FeatureID SAMPLE value Taxon Domain Supergroup Phylum Class Order Family Genus
##   <chr>     <chr>  <dbl> <chr> <chr>  <chr>      <chr>  <chr> <chr> <chr>  <chr>
## 1 0016c920… 70_MC…    45 Euka… Eukar… Rhizaria   Cerco… Chlo… Chlo… Chlor… Loth…
## 2 004b2336… 77_MC…    21 Euka… Eukar… Alveolata  Dinof… Dino… Dino… Dinop… Dino…
## 3 006f5664… 66_MC…   315 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 4 006f5664… 71_MC…   181 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 5 006f5664… 77_MC…    89 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 6 006f5664… 78_MC…    52 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## # … with 24 more variables: Species <chr>, Consensus <dbl>, SAMPLENAME <chr>,
## #   VENT <chr>, COORDINATES <chr>, SITE <chr>, Sample_or_Control <chr>,
## #   SAMPLEID <chr>, DEPTH <chr>, SAMPLETYPE <chr>, YEAR <int>, TEMP <dbl>,
## #   pH <dbl>, PercSeawater <dbl>, Mg <dbl>, NO3 <dbl>, H2 <dbl>, H2S <dbl>,
## #   CH4 <dbl>, ProkConc <dbl>, Sample_actual <chr>, Type <chr>, DATASET <chr>,
## #   DECONTAM <chr>
# head(insitu_wide)
# unique(asv_insitu$SAMPLETYPE)
# unique(asv_insitu$SITE)

tax_asv_id <- asv_insitu %>% 
  filter(value > 0) %>% #remove zero values
  select(FeatureID, SITE, SAMPLETYPE) %>% # isolate only ASVs that are PRESENT at a site and sampletype
  distinct() %>% #unique this, as presense = present in at least 1 rep (where applicable)
  unite(sample_id, SITE, SAMPLETYPE, sep = "_") %>% 
  # select(-SITE) %>% 
  # distinct() %>% 
  add_column(present = 1) %>%
  pivot_wider(names_from = sample_id, values_from = present, values_fill = 0) %>% 
  rowwise() %>% 
  mutate_at(vars(FeatureID), factor)
#   
#   mutate(total = sum(c_across(where(!is.na(.))))) %>%
#   mutate(FREQ = case_when(
#     total == 1 ~ "Appears in only 1 sample type",
#     total > 1 ~ "Appears in more than 1 location or sample type"
#   ))
# head(tax_asv_id)
library(purrr)
any_cols <- function(tax_asv_id) reduce(tax_asv_id, `|`)

asv_class <- tax_asv_id %>%
  mutate(vent = ifelse(any_cols(across(contains("_Vent"), ~ . > 0)), "VENT", ""),
         plume= ifelse(any_cols(across(contains("_Plume"), ~ . > 0)), "PLUME", ""),
         bsw = ifelse(any_cols(across(contains("_Background"), ~ . > 0)), "BSW", ""),
         ) %>% 
  unite(class_tmp, vent, plume, bsw, sep = "_", na.rm = TRUE) %>% 
  mutate(CLASS = case_when(
  class_tmp == "VENT__" ~ "Vent only",
  class_tmp == "_PLUME_" ~ "Plume only",
  class_tmp == "__BSW" ~ "Background only",
  class_tmp == "VENT__BSW" ~ "Vent & background",
  class_tmp == "VENT_PLUME_BSW" ~ "Vent, plume, & background",
  class_tmp == "VENT_PLUME_" ~ "Vent & plume",
  class_tmp == "_PLUME_BSW" ~ "Plume & background"
  )) %>% 
  select(FeatureID, CLASS) %>% distinct()

colnames(tax_asv_id)
##  [1] "FeatureID"             "VonDamm_Vent"          "Piccard_Vent"         
##  [4] "Piccard_Background"    "VonDamm_Plume"         "Axial_Vent"           
##  [7] "VonDamm_Background"    "Piccard_Plume"         "GordaRidge_Vent"      
## [10] "GordaRidge_Background" "GordaRidge_Plume"      "Axial_Background"     
## [13] "Axial_Plume"
head(tax_asv_id)
## # A tibble: 6 × 13
## # Rowwise: 
##   FeatureID  VonDamm_Vent Piccard_Vent Piccard_Backgro… VonDamm_Plume Axial_Vent
##   <fct>             <dbl>        <dbl>            <dbl>         <dbl>      <dbl>
## 1 0016c9209…            1            0                0             0          0
## 2 004b23363…            0            1                0             0          0
## 3 006f5664d…            1            1                0             0          0
## 4 007322d9b…            0            1                0             0          0
## 5 008783681…            1            1                0             0          0
## 6 0090723b7…            1            0                0             0          0
## # … with 7 more variables: VonDamm_Background <dbl>, Piccard_Plume <dbl>,
## #   GordaRidge_Vent <dbl>, GordaRidge_Background <dbl>, GordaRidge_Plume <dbl>,
## #   Axial_Background <dbl>, Axial_Plume <dbl>
asv_class_SITE <- tax_asv_id %>%
  mutate(mcr = ifelse(any_cols(across(contains("Piccard") | contains("VonDamm"), ~ . > 0)), "MCR", ""),
         axial = ifelse(any_cols(across(contains("Axial"), ~ . > 0)), "AxS", ""),
         gr = ifelse(any_cols(across(contains("Gorda"), ~ . > 0)), "GR", "")
         ) %>% 
  unite(class_tmp, mcr, axial, gr, sep = "_", na.rm = TRUE) %>% 
  mutate(SITE_CLASS = case_when(
  class_tmp == "__GR" ~ "Gorda Ridge only",
  class_tmp == "_AxS_" ~ "Axial only",
  class_tmp == "_AxS_GR" ~ "Axial & Gorda Ridge",
  class_tmp == "MCR__" ~ "Mid-Cayman Rise",
  class_tmp == "MCR__GR" ~ "Mid-Cayman Rise & Gorda Ridge",
  class_tmp == "MCR_AxS_" ~ "Mid-Cayman Rise & Axial",
  class_tmp == "MCR_AxS_GR" ~ "All sites"
  )) %>% 
  select(FeatureID, SITE_CLASS) %>% distinct()
insitu_asv_wClass <- asv_insitu %>% 
  left_join(asv_class) %>% 
  left_join(asv_class_SITE)
## Joining, by = "FeatureID"
## Joining, by = "FeatureID"
# head(insitu_asv_wClass)

Bubble plot

# head(asv_insitu)
# svg("bubbles.svg", h = 4, w = 8)
asv_insitu %>% 
  select(DATASET, FeatureID, SAMPLETYPE) %>% 
  group_by(DATASET, SAMPLETYPE) %>% 
    summarise(COUNT = n_distinct(FeatureID)) %>% 
  ggplot(aes(x = DATASET, y = SAMPLETYPE, fill = SAMPLETYPE)) +
    geom_point(aes(size = COUNT), shape = 21, color = "black") +
    scale_size_continuous(range = c(4,20)) +
  scale_fill_viridis_d(option = "B") +
  theme_void() +
  theme(legend.position = "right",
        axis.text = element_text(color = "black"))
## `summarise()` has grouped output by 'DATASET'. You can override using the `.groups` argument.

# dev.off()
unique(insitu_asv_wClass$CLASS)
## [1] "Vent only"                 "Vent & background"        
## [3] "Plume only"                "Background only"          
## [5] "Vent, plume, & background" "Vent & plume"             
## [7] "Plume & background"
unique(insitu_asv_wClass$SITE_CLASS)
## [1] "Mid-Cayman Rise"               "Axial only"                   
## [3] "Gorda Ridge only"              "Mid-Cayman Rise & Gorda Ridge"
## [5] "Axial & Gorda Ridge"           "Mid-Cayman Rise & Axial"      
## [7] "All sites"

6 Functions to explore community diversity

To explore microbial eukaryotic community diversity at all three sites, below functions have been written to pass 18S data for each site through the same analysis. This will be done for all sites together and for them individually.

Sections below highlight Axial Seamount, Mid-Cayman Rise, and Gorda Ridge data individually.

axial <- c("Axial")
mcr <- c("VonDamm", "Piccard")
gr <- c("GordaRidge")

6.1 Bar plot function

Create a bar plot showing the relative sequence abundance of 18S results to the Supergroup and Phylum level. Function averages across replicates and then sums to the phylum and supergroup level. Bar plot shows the relative sequence abundance.

make_bar_relabun <- function(df, selection){
  df_out <- df %>% 
  filter(SITE %in% selection) %>% 
  filter(Domain == "Eukaryota") %>%
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup()
  supergroup <- df_out %>% 
  group_by(SITE, SAMPLETYPE, VENT, YEAR, Supergroup) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  ggplot(aes(x = VENT, y = SEQ_SUM, fill = Supergroup)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
    facet_grid(. ~ SITE +YEAR + SAMPLETYPE, scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "right") +
  scale_y_continuous(expand = c(0,0)) +
  # scale_fill_brewer(palette = "Set2") +
  scale_fill_manual(values = c("#fa9fb5", "#c51b8a", "#67000d", "#ef3b2c", "#ffffcc", "#feb24c", "#c7e9b4", "#1d91c0", "#253494", "#9e9ac8", "#238b45", "#54278f", "#bdbdbd", "#252525")) +
  labs(x = "", y = "Relative abundance")
  
  phylum <- df_out %>% 
    unite(SupergroupPhylum, Supergroup, Phylum, sep = "-") %>% 
  group_by(SITE, SAMPLETYPE, VENT, YEAR, SupergroupPhylum) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  ggplot(aes(x = VENT, y = SEQ_SUM, fill = SupergroupPhylum)) +
    geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
    facet_grid(. ~ SITE +YEAR + SAMPLETYPE, scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        strip.background = element_blank(), strip.text = element_text(color = "black"),
        legend.position = "right") +
  scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
    "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
    "#807dba", "#54278f", "#bdbdbd", "black", "white", "#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45")) +
  labs(x = "", y = "Relative abundance")
  supergroup + phylum + patchwork::plot_layout(ncol = 1)
}

# make_bar_relabun(insitu_asv_wClass, axial)

6.2 Tile plot

Relative abundance plots are misleading, as this tag-sequence data is compositional. To combat this, we can also perform a center log-ratio transformation of the sequence counts. This tile plot (or heat map) will show the relationship from the data mean. Positive values thus demonstrate an increase in the taxa, while negative values illustrate the opposite.

# Not sure if I need this
tax_key <- insitu_asv_wClass %>% 
  select(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species, CLASS, SITE_CLASS) %>% 
  distinct()

Ahead of the CLR transformation, average across replicates, then sum to the Class level. THEN perform CLR transformation and plot as heat map.

make_clr_trans_tile <- function(df, selection){
  df_wide <- df %>%
    filter(SITE %in% selection) %>%
  # df_wide <- insitu_asv_wClass %>% 
    # filter(SITE %in% axial) %>% 
    filter(Domain == "Eukaryota") %>% 
  # Average across replicates
      group_by(FeatureID, SAMPLENAME, VENT, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species) %>% 
        summarise(AVG = mean(value)) %>% 
      ungroup() %>% 
    # Sum to the Order taxonomic classification
    unite(SAMPLENAME_2, SAMPLENAME, VENT, sep = "_") %>% 
      group_by(SAMPLENAME_2, Supergroup, Phylum, Class) %>% 
        summarise(CLASS_SUM = sum(AVG)) %>% 
    unite(CLASS, Supergroup, Phylum, Class, sep = " ") %>% 
    select(CLASS, SAMPLENAME_2, CLASS_SUM) %>% 
    pivot_wider(names_from = SAMPLENAME_2, values_from = CLASS_SUM, values_fill = 0) %>% 
    column_to_rownames(var = "CLASS")
  ## Take wide data frame and CLR transform, pivot to wide, and plot
  data.frame(compositions::clr(df_wide)) %>% 
      rownames_to_column(var = "CLASS") %>%
      pivot_longer(cols = starts_with(selection), values_to = "CLR", names_to = "SAMPLENAME_2") %>% 
      separate(SAMPLENAME_2, c("SAMPLENAME", "VENT"), sep = "_") %>% 
      separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>% 
      mutate(VENT = str_replace_all(VENT, "\\.", " ")) %>% 
      mutate(REGION = case_when(
        SITE == "GordaRidge" ~ "Gorda Ridge",
        SITE %in% mcr ~ "Mid-Cayman Rise",
        SITE == "Axial" ~ "Axial")) %>% 
      mutate(VENTNAME = case_when(
        REGION == "Gorda Ridge" ~ VENT,
        REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
        REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
      )) %>% select(-Sample_tmp) %>% 
    unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>% 
    separate(CLASS, c("Supergroup", "Phylum", "Class"), sep = " ", remove = FALSE) %>%
    ggplot(aes(x = SAMPLE, y = Class, fill = CLR)) +
      geom_tile(color = "#252525") + 
      theme(legend.position = "right", 
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), 
            panel.border = element_blank(), 
            panel.background = element_blank(),
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, color = "black",size = 8), 
            axis.text.y = element_text(color = "black", size = 8),
            strip.background = element_blank(), 
            strip.text.y = element_text(hjust = 0, vjust = 0.5, angle = 0),
            # strip.text.x = element_blank(), 
            legend.title = element_blank()) + 
      labs(x = "", y = "") +
      scale_fill_gradient2(low = "#4575b4", mid = "white", high = "#d73027", na.value = "grey50") +
      facet_grid(Supergroup + Phylum ~ SAMPLETYPE, space = "free", scales = "free")
}

6.3 PCA

Similar to aove, the first step in this function transforms data using CLR (to ASV level though). First plot will show eigen values (scree plot to determine if 2 vs. 3 dimensions is best for data). Then function extracts data points and creates PCA plot.

make_pca <- function(df, selection){
 df_wide_asv <- df %>%
    # df_wide_asv <- insitu_asv_wClass %>%
  filter(SITE %in% selection) %>%
    # filter(SITE %in% axial) %>%
  filter(Domain == "Eukaryota") %>% 
  # Average across replicates
    group_by(FeatureID, SAMPLENAME, VENT) %>% 
    summarise(AVG = mean(value)) %>% 
    ungroup() %>% 
  separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>% 
  mutate(REGION = case_when(
    SITE == "GordaRidge" ~ "Gorda Ridge",
    SITE %in% mcr ~ "Mid-Cayman Rise",
    SITE == "Axial" ~ "Axial")) %>% 
  mutate(VENTNAME = case_when(
    REGION == "Gorda Ridge" ~ VENT,
    REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
    REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
  )) %>% select(-Sample_tmp) %>% 
  unite(SAMPLE, SAMPLETYPE, REGION, VENTNAME, sep = "_", remove = FALSE) %>% 
   group_by(FeatureID, SAMPLE) %>% 
   summarise(SUM = sum(AVG)) %>% 
  pivot_wider(names_from = FeatureID, values_from = SUM, values_fill = 0) %>% 
  column_to_rownames(var = "SAMPLE")
  # look at eigenvalues
  pca_lr <- prcomp(data.frame(compositions::clr(df_wide_asv)))
  variance_lr <- (pca_lr$sdev^2)/sum(pca_lr$sdev^2)
  ## View bar plot
  barplot(variance_lr, main = "Log-Ratio PCA Screeplot", xlab = "PC Axis", ylab = "% Variance", 
    cex.names = 1.5, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
  ## Extract PCR points
  data.frame(pca_lr$x, SAMPLE = rownames(pca_lr$x)) %>% 
      separate(SAMPLE, c("SAMPLETYPE", "REGION", "VENTNAME"), sep = "_", remove = FALSE) %>% 
  ## Generate PCA plot
  ggplot(aes(x = PC1, y = PC2, shape = SAMPLETYPE, fill = VENTNAME)) +
    geom_hline(yintercept = 0) + geom_vline(xintercept = 0, color = "#525252") +
    geom_point(size=4, stroke = 1, aes(fill = VENTNAME)) +
    ylab(paste0('PC2 ',round(variance_lr[2]*100,2),'%')) +
    xlab(paste0('PC1 ',round(variance_lr[1]*100,2),'%')) +
    scale_shape_manual(values = c(21, 23, 24)) +
    # scale_fill_manual(values = fill_color) +
    # scale_color_manual(values = color_color) +
    theme_bw() +
    theme(axis.text = element_text(color="black", size=12),
          legend.title = element_blank(),
          axis.title = element_text(color="black", size=14),
          legend.text = element_text(color = "black", size = 14),
          plot.margin = margin(2, 1, 2, 1, "cm")) +
    guides(fill = guide_legend(override.aes = list(shape = 21) ),
              shape = guide_legend(override.aes = list(fill = "black")))
  }

6.4 ASV richness

From complete dataset, average across replicates, then sum the total number of ASVs in each sample. Then plot a data point for total number of ASVs (ASV richness) by sample type - where sample type represents the vent, plume, vs. background. Box plots show the median and range.

make_asv_rich <- function(df, selection){
  df %>% 
  filter(SITE %in% selection) %>%
  filter(Domain == "Eukaryota") %>% 
  # Average across replicates
    group_by(FeatureID, SAMPLENAME, VENT, Supergroup) %>% 
    summarise(AVG = mean(value)) %>% 
  separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>% 
  mutate(REGION = case_when(
    SITE == "GordaRidge" ~ "Gorda Ridge",
    SITE %in% mcr ~ "Mid-Cayman Rise",
    SITE == "Axial" ~ "Axial")) %>% 
  mutate(VENTNAME = case_when(
    REGION == "Gorda Ridge" ~ VENT,
    REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
    REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
  )) %>% select(-Sample_tmp) %>% 
  unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>% 
  ungroup() %>% 
  group_by(REGION, SAMPLE, SAMPLETYPE) %>% 
    summarise(NUM_ASV = n()) %>% 
  ggplot(aes(x = SAMPLETYPE, y = NUM_ASV, shape = SAMPLETYPE)) +
  geom_boxplot(aes(group = SAMPLETYPE), alpha = 0.8, width = 0.4) +
  geom_jitter(size=2, aes(fill = REGION)) +
  scale_shape_manual(values = c(21, 23, 24)) +
  theme_bw() +
  theme(axis.text = element_text(color="black", size=12),
        legend.title = element_blank(),
        axis.title = element_text(color="black", size=14),
        legend.text = element_text(color = "black", size = 14)) +
  guides(fill = guide_legend(override.aes = list(shape = 21) ),
            shape = guide_legend(override.aes = list(fill = "black") ) ) +
  labs(x = "", y = "Total number of ASVs")
}

6.5 Presence-absence

Bar plot (colors correspond to Supergroup) represents the number of ASVs shared or unique to each sample. Combination matrix below bars shows which samples are considered for the bar plot.

make_upset_plot <- function(df, selection){ 
  df %>% 
  filter(SITE %in% selection) %>%
  filter(Domain == "Eukaryota") %>% 
  # Average across replicates
    group_by(FeatureID, SAMPLENAME, VENT, Supergroup) %>% 
    summarise(AVG = mean(value)) %>% 
  separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>% 
  mutate(REGION = case_when(
    SITE == "GordaRidge" ~ "Gorda Ridge",
    SITE %in% mcr ~ "Mid-Cayman Rise",
    SITE == "Axial" ~ "Axial")) %>% 
  mutate(VENTNAME = case_when(
    REGION == "Gorda Ridge" ~ VENT,
    REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
    REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
  )) %>% select(-Sample_tmp) %>% 
  unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>% 
  distinct(FeatureID, Supergroup, AVG, SAMPLE, .keep_all = TRUE) %>% 
  group_by(FeatureID, Supergroup) %>% 
  summarise(SAMPLE = list(SAMPLE)) %>% 
  ggplot(aes(x = SAMPLE)) +
    geom_bar(color = "black", width = 0.5, aes(fill = Supergroup)) +
    scale_x_upset(n_intersections = 25) +
  scale_y_continuous(expand = c(0,0)) +
  labs(x = "", y = "Shared ASVs") +
  theme_linedraw() +
  theme(axis.text = element_text(color="black", size=10),
        axis.title = element_text(color="black", size=10),
        legend.text = element_text(color = "black", size = 10),
        plot.margin = margin(1, 1, 1, 5, "cm")) +
    scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c", 
    "#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45", 
    "#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc", 
    "#807dba", "#54278f", "#bdbdbd", "black"))
}

7 Axial Seamount analysis

7.1 Bar plot relative abundance: Axial

make_bar_relabun(insitu_asv_wClass, axial)
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.

Axial Seamount samples from archived material - span 2013, 2014, and 2015. First, the background and plume (from 2015 only, and from plume associated with the Anemone vent) are different from the vent samples - overwhelmingly stramneopile and rhizaria. For the background and plume, the stramenopiles appear to be associated with ochrophyta or opalozoa. For the plume, the rhizaria population was associated with cercozoa, while the background seawater was identified as belonging to radiolaria.

The major difference between the background/plume and vent sites was the higher relative sequence abundance of ciliates and opisthokonta. For the opisthokonta, these are primarily metazoa - and I will need to investigate this further. Exceptions for this include the ‘Dependable’ vent from 2013, which had a completely different composition, and ‘Marker 113’ in 2015, which the opisthokonta sequences were assigned choanoflagellate and fungi.

Further questions to consider

Any geochemical changes to Marker 113 from 2013/2014 to 2015? Could attribute difference of opisthokonta colonization.

Need to get metadata

Some of the Axial data had low sequence totals - is the Dependable vent one of these samples?

Yes it is - probably need to remove this sample! ## Tile plot CLR: Axial

make_clr_trans_tile(insitu_asv_wClass, axial)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 770 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

7.2 PCA: Axial

make_pca(insitu_asv_wClass, axial)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 439 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.

7.3 ASV richness: Axial

make_asv_rich(insitu_asv_wClass, axial)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 439 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.

We only have 1 sample for background and plume from Axial Seamount. But this shows that the vent sites have varied ASV richness,

7.4 Presence-absence: Axial

make_upset_plot(insitu_asv_wClass, axial)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 439 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 35 rows containing non-finite values (stat_count).

8 Mid-Cayman Rise

8.1 Bar plot relative abundance: MCR

make_bar_relabun(insitu_asv_wClass, mcr)
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.

8.2 Tile plot CLR: MCR

make_clr_trans_tile(insitu_asv_wClass, mcr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 1378 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

8.3 PCA: MCR

make_pca(insitu_asv_wClass, mcr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 3809 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.

8.4 ASV richness: MCR

make_asv_rich(insitu_asv_wClass, mcr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 3809 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.

8.5 Presence-absence: MCR

make_upset_plot(insitu_asv_wClass, mcr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 3809 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 498 rows containing non-finite values (stat_count).

9 Gorda Ridge

9.1 Bar plot relative abundance: GR

make_bar_relabun(insitu_asv_wClass, gr)
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.

9.2 Tile plot CLR: GR

make_clr_trans_tile(insitu_asv_wClass, gr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 918 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

9.3 PCA: GR

make_pca(insitu_asv_wClass, gr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 401 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.

9.4 ASV richness: GR

make_asv_rich(insitu_asv_wClass, gr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 401 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.

9.5 Presence-absence: GR

make_upset_plot(insitu_asv_wClass, gr)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 401 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 16 rows containing non-finite values (stat_count).

10 Compare all three sites

all <- c("Axial", "VonDamm", "Piccard", "GordaRidge")
mcr <- c("VonDamm", "Piccard")

10.1 Bar plot relative abundance: all

make_bar_relabun(insitu_asv_wClass, all)
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.

10.2 Tile plot CLR: all

make_clr_trans_tile(insitu_asv_wClass, all)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 5016 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

10.3 PCA: all

make_pca(insitu_asv_wClass, all)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.

10.4 ASV richness: all

make_asv_rich(insitu_asv_wClass, all)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.

10.5 Presence-absence: all

make_upset_plot(insitu_asv_wClass, all)
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 787 rows containing non-finite values (stat_count).

Isolate shared and unique for some combinations, but only a few - as key examples! at the ASV level.

11 Compare resident vs. cosmopolitan taxa

# head(insitu_asv_wClass) # from above, where I've classified each ASV by site and occurence in sample type
unique(insitu_asv_wClass$CLASS)
## [1] "Vent only"                 "Vent & background"        
## [3] "Plume only"                "Background only"          
## [5] "Vent, plume, & background" "Vent & plume"             
## [7] "Plume & background"
unique(insitu_asv_wClass$SITE_CLASS)
## [1] "Mid-Cayman Rise"               "Axial only"                   
## [3] "Gorda Ridge only"              "Mid-Cayman Rise & Gorda Ridge"
## [5] "Axial & Gorda Ridge"           "Mid-Cayman Rise & Axial"      
## [7] "All sites"
unique(insitu_asv_wClass$SAMPLETYPE)
## [1] "Vent"       "Background" "Plume"

11.1 Isolate shared ASVs among all

all_vents <- insitu_asv_wClass %>% 
  filter(SITE_CLASS == "All sites" & grepl("Vent", CLASS)) %>% 
  filter(SAMPLETYPE == "Vent")
# View(all_vents)
# head(insitu_asv_wClass)

Presence/absence of ASVs to the Class level across all vents.

# svg("pa-ventsites.svg", w=8, h=12)
insitu_asv_wClass %>% 
  filter(grepl("Vent", CLASS)) %>% 
  filter(Domain == "Eukaryota") %>% 
  filter(SAMPLETYPE == "Vent") %>% 
  filter(!is.na(Class)) %>% 
  filter(Phylum != "Stramenopiles_X" & Phylum != "Pseudofungi" & Phylum != "Chrompodellids") %>%
  filter(Supergroup != "Opisthokonta") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  mutate(SITE_2 = case_when(
    SITE == "Piccard" ~ "MCR",
    SITE == "VonDamm" ~ "MCR",
    TRUE ~ SITE
  )) %>% 
  group_by(SITE_2, SAMPLETYPE, VENT, Supergroup, Class) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  ggplot(aes(x = VENT, y = Class, fill = SEQ_SUM)) +
    geom_tile(stat = "identity", color = "white", width = 0.9, fill = "black") +
    facet_grid(Supergroup ~ SITE_2 + SAMPLETYPE , scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
        strip.background = element_blank(), strip.text.x = element_text(color = "black", angle = 0),
        panel.grid = element_blank(), strip.text.y = element_text(color = "black", angle = 0))
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE_2', 'SAMPLETYPE', 'VENT', 'Supergroup'. You can override using the `.groups` argument.

# dev.off()

What is the vent signature across the different sites?

Repeat above plot, isolate ciliate, dinos, rhizaria, and stramenopiles.

phyla <- c("Alveolata-Ciliophora", "Alveolata-Dinoflagellata", "Rhizaria", "Stramenopiles")

# svg("pa-ventsites-selected.svg", w=8, h=10)
insitu_asv_wClass %>% 
  filter(grepl("Vent", CLASS)) %>% 
  filter(Domain == "Eukaryota") %>% 
  filter(SAMPLETYPE == "Vent") %>% 
  filter(!is.na(Class)) %>% 
  filter(Phylum != "Stramenopiles_X" & Phylum != "Pseudofungi" & Phylum != "Chrompodellids") %>%
  filter(Supergroup != "Opisthokonta") %>% 
  mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
         Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
         Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
         Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>% 
  filter(Supergroup %in% phyla) %>% 
  group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
           VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>% 
    summarise(SEQ_AVG_REP = mean(value)) %>% 
  ungroup() %>% 
  mutate(SITE_2 = case_when(
    SITE == "Piccard" ~ "MCR",
    SITE == "VonDamm" ~ "MCR",
    TRUE ~ SITE
  )) %>% 
  group_by(SITE_2, SAMPLETYPE, VENT, Supergroup, Class) %>% 
    summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>% 
  ggplot(aes(x = VENT, y = Class, fill = SEQ_SUM)) +
    geom_tile(stat = "identity", color = "white", width = 0.9, fill = "black") +
    facet_grid(Supergroup ~ SITE_2 + SAMPLETYPE , scale = "free", space = "free") +
  theme_linedraw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
        strip.background = element_blank(), strip.text.x = element_text(color = "black", angle = 0),
        panel.grid = element_blank(), strip.text.y = element_text(color = "black", angle = 0))
## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE_2', 'SAMPLETYPE', 'VENT', 'Supergroup'. You can override using the `.groups` argument.

# dev.off()

11.2 Among resident ASVs, what is distribution?

head(insitu_asv_wClass)
## # A tibble: 6 × 37
##   FeatureID SAMPLE value Taxon Domain Supergroup Phylum Class Order Family Genus
##   <chr>     <chr>  <dbl> <chr> <chr>  <chr>      <chr>  <chr> <chr> <chr>  <chr>
## 1 0016c920… 70_MC…    45 Euka… Eukar… Rhizaria   Cerco… Chlo… Chlo… Chlor… Loth…
## 2 004b2336… 77_MC…    21 Euka… Eukar… Alveolata  Dinof… Dino… Dino… Dinop… Dino…
## 3 006f5664… 66_MC…   315 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 4 006f5664… 71_MC…   181 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 5 006f5664… 77_MC…    89 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 6 006f5664… 78_MC…    52 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## # … with 26 more variables: Species <chr>, Consensus <dbl>, SAMPLENAME <chr>,
## #   VENT <chr>, COORDINATES <chr>, SITE <chr>, Sample_or_Control <chr>,
## #   SAMPLEID <chr>, DEPTH <chr>, SAMPLETYPE <chr>, YEAR <int>, TEMP <dbl>,
## #   pH <dbl>, PercSeawater <dbl>, Mg <dbl>, NO3 <dbl>, H2 <dbl>, H2S <dbl>,
## #   CH4 <dbl>, ProkConc <dbl>, Sample_actual <chr>, Type <chr>, DATASET <chr>,
## #   DECONTAM <chr>, CLASS <chr>, SITE_CLASS <chr>
unique(insitu_asv_wClass$CLASS)
## [1] "Vent only"                 "Vent & background"        
## [3] "Plume only"                "Background only"          
## [5] "Vent, plume, & background" "Vent & plume"             
## [7] "Plume & background"
unique(insitu_asv_wClass$SITE_CLASS)
## [1] "Mid-Cayman Rise"               "Axial only"                   
## [3] "Gorda Ridge only"              "Mid-Cayman Rise & Gorda Ridge"
## [5] "Axial & Gorda Ridge"           "Mid-Cayman Rise & Axial"      
## [7] "All sites"
unique(insitu_asv_wClass$SAMPLETYPE)
## [1] "Vent"       "Background" "Plume"
insitu_asv_wClass %>% 
  # filter(SITE %in% selection) %>%
  filter(Domain == "Eukaryota") %>% 
  # Average across replicates
    group_by(FeatureID, SAMPLENAME, VENT, Supergroup, CLASS, SITE_CLASS) %>% 
    summarise(AVG = mean(value)) %>% 
  separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>% 
  mutate(REGION = case_when(
    SITE == "GordaRidge" ~ "Gorda Ridge",
    SITE %in% mcr ~ "Mid-Cayman Rise",
    SITE == "Axial" ~ "Axial")) %>% 
  mutate(VENTNAME = case_when(
    REGION == "Gorda Ridge" ~ VENT,
    REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
    REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
  )) %>% select(-Sample_tmp) %>% 
  unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>% head
## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Supergroup', 'CLASS'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## # A tibble: 6 × 12
## # Groups:   FeatureID, VENT, Supergroup, CLASS [6]
##   FeatureID    SITE   SAMPLETYPE YEAR  VENT    Supergroup CLASS SITE_CLASS   AVG
##   <chr>        <chr>  <chr>      <chr> <chr>   <chr>      <chr> <chr>      <dbl>
## 1 0016c920953… VonDa… Vent       2020  Rav2    Rhizaria   Vent… Mid-Cayma…    45
## 2 004b23363bd… Picca… Vent       2020  Shrimp… Alveolata  Vent… Mid-Cayma…    21
## 3 006f5664d25… Picca… Vent       2020  LotsOS… Stramenop… Vent… Mid-Cayma…    52
## 4 006f5664d25… Picca… Vent       2020  Shrimp… Stramenop… Vent… Mid-Cayma…    89
## 5 006f5664d25… VonDa… Vent       2020  ArrowL… Stramenop… Vent… Mid-Cayma…   315
## 6 006f5664d25… VonDa… Vent       2020  OldMan… Stramenop… Vent… Mid-Cayma…   181
## # … with 3 more variables: SAMPLE <chr>, REGION <chr>, VENTNAME <chr>

12 Incorporate grazing data and metadata

Compare bubble plots with various metadata parameters.

Import grazing data as output from previous, plot with bubbles underneath of specific parameters.